# package loading
library(RnaSeqSampleSize)
## Loading required package: ggplot2
## Loading required package: RnaSeqSampleSizeData
## Loading required package: edgeR
## Loading required package: limma
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
library(ggplot2)
library(biomaRt)
set.seed(538734517)
GSE184156 a human cell-line bulk RNASeq
experiment on extra-cellular matrix vs. no matrix as our reference
data-set for power analysis.FDR=0.05, m=14065, m1=78 (respectively false
discovery rate, total genes in experiment, expected number of prognostic
genes) , we see n>=17 as the threshold to reach
sufficient power.# file parsing and cleaning
d <- read.delim("/Users/paulcao/Downloads/GSE184156_RNAseq_ECM_rnaseq_featurecounts.txt",
header = TRUE, sep = "\t", skip = 1)
dataMat <- d[, c(10, 11, 12, 16, 17, 18)]
rownames(dataMat) <- d$Geneid
colnames(dataMat) <- c("LIVER1", "LIVER2", "LIVER3", "NO13", "NO14", "NO15")
# drop rows containing n/a dataMat <- na.omit(dataMat)
head(dataMat)
## LIVER1 LIVER2 LIVER3 NO13 NO14 NO15
## DDX11L1 0 0 0 0 0 0
## WASH7P 63 69 37 69 53 48
## MIR6859-1 20 16 8 28 21 20
## MIR1302-2HG 0 0 0 0 0 0
## MIR1302-2 0 0 0 0 0 0
## FAM138A 0 0 0 0 0 0
dim(dataMat)
## [1] 39011 6
genelist <- read.delim("/Users/paulcao/cadherin_and_integrin.txt", header=TRUE, sep="\t")
genelist <- genelist[, c(2, 3)]
colnames(genelist) <- c("Gene.name", "Gene.stable.ID")
head(genelist)
## Gene.name Gene.stable.ID
## 1 CDH1 GC16P068737
## 2 CDH2 GC18M028088
## 3 CDH5 GC16P066366
## 4 CDH3 GC16P068637
## 5 CDH13 GC16P082626
## 6 CDH11 GC16M064943
dim(genelist)
## [1] 78 2
RNAseqsamplesize [@Zhao2018] was used to do the analysis. With expected fold change between groups = 2, FDR set = 0.01 and number of samples = 5, power was computed to be 0.0727 for the selected genes of interest which is very low. Dispersion was also computed as 0.00385.
# estimate gene read count and dispersion distribution
distribution <- est_count_dispersion(dataMat, group = c(0, 0, 0, 1, 1, 1))
## Disp = 0.00385 , BCV = 0.062
# power estimation some gene IDs are not in dataset because they are alleles on
# alternative sequences (rapsn and Fbxo32)
power <- est_power_distribution(n = 5, f = 0.01, rho = 2, distributionObject = distribution,
selectedGenes = genelist$Gene.name, storeProcess = TRUE)
mean(power$power) # 0.07266288
## [1] 0.07266288
power2 <- est_power_distribution(n = 6, f = 0.01, rho = 2, distributionObject = distribution,
selectedGenes = genelist$Gene.name, storeProcess = TRUE)
mean(power2$power)
## [1] 0.1261367
From changing around FDR (fdr parameter in
est_power_curve()) and coverage (lambda0
parameter in est_power_curve()) as well as making an
optimization plot, it appears that at least 10 samples are needed with
an average coverage > 15 reads/gene depending on desired FDR in order
to achieve >80% power. To be precise, 18 samples with 10 coverage and
FDR=0.05 will give 83% power. To be safe, for the genes of interest we
would need at least 21 samples to achieve 80% power.
Red line is FDR=0.01, coverage=5. Blue line is FDR=0.05, coverage=5. Purple line is FDR=0.01, coverage=10. Green line is FDR=0.05, coverage=10. Yellow line is FDR=0.05, coverage=20.
Blue to brown gradient shows power from 0 to 1. Here FDR=0.01.
est_power(n = 8, lambda0 = 20, phi0 = 0.07154, f = 0.05, m = 52000, m1 = 71)
## [1] 0.51
power10 <- est_power_distribution(n = 10, f = 0.05, rho = 2, distributionObject = distribution,
selectedGenes = genelist$Gene.name, storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 33 selectedGenes were not found in distributionObject, discard them for further analysis
mean(power10$power)
## [1] 0.5767527
power13 <- est_power_distribution(n = 13, f = 0.05, rho = 2, distributionObject = distribution,
selectedGenes = genelist$Gene.name, storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 33 selectedGenes were not found in distributionObject, discard them for further analysis
mean(power13$power)
## [1] 0.7247843
power15 <- est_power_distribution(n = 15, f = 0.05, rho = 2, distributionObject = distribution,
selectedGenes = genelist$Gene.name, storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 33 selectedGenes were not found in distributionObject, discard them for further analysis
mean(power15$power)
## [1] 0.7799651
power18 <- est_power_distribution(n = 18, f = 0.05, rho = 2, minAveCount = 15, distributionObject = distribution,
selectedGenes = genelist$Gene.name, storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 33 selectedGenes were not found in distributionObject, discard them for further analysis
mean(power18$power)
## [1] 0.8300565
gene_readcounts <- distribution$pseudo.counts.mean[which(names(distribution$pseudo.counts.mean) %in% genelist$Gene.name)]
gene_dispersions <- distribution$tagwise.dispersion[which(names(distribution$pseudo.counts.mean) %in% genelist$Gene.name)]
head(gene_readcounts)
## ITGB3BP CELSR2 ITGA10 ITGB1BP1 ITGB6 ITGA6
## 448.611665 18530.060807 31.379411 2193.086763 1.070137 136.017278
mean(gene_readcounts)
## [1] 1384.193
mean(gene_dispersions)
## [1] 0.02174131
ggplot(data=data.frame(counts=as.numeric(gene_readcounts),name=as.character(genelist$Gene.name[genelist$Gene.name %in% names(gene_readcounts)]))) + geom_bar(aes(x=reorder(name,-counts),y=counts),stat="identity") + scale_y_log10() + coord_flip() + xlab("Gene name") + ylab("Mean count per gene across samples") + theme(axis.text.y = element_text(size=7)) + ggtitle("Mean count per gene across samples from experimental data") + labs(caption="Each sample has approximately 50 million reads. Experiment ID GSE58669")
gene_readcounts
## ITGB3BP CELSR2 ITGA10 ITGB1BP1 ITGB6 ITGA6
## 4.486117e+02 1.853006e+04 3.137941e+01 2.193087e+03 1.070137e+00 1.360173e+02
## ITGAV ITGA9 CELSR3 LOC100130345 ITGB5 IBSP
## 2.250075e+03 4.167625e+00 1.274939e+03 3.811665e+00 3.468786e+03 1.028280e+01
## CDH18 CDH10 ITGA1 ITGA2 CDHR2 ITGB8
## 2.441045e+01 1.718927e+00 3.695383e+01 4.949955e+02 2.130482e+00 1.155578e+01
## CDHR3 CPED1 ITGB1 ILK ITFG2 ITGB7
## 5.084312e+01 3.680381e+00 4.714837e+03 1.733754e+03 6.097740e+02 3.540124e+01
## ITGA5 ITGA7 CDH24 ITGAL ITGAM ITGAX
## 2.553918e+02 4.614826e+01 1.193567e+03 2.505413e+00 3.133014e+01 4.150712e+00
## ITFG1 CDH5 CDH3 CDH1 CDH15 ITGAE
## 8.586064e+02 3.100385e+00 2.984464e+03 1.360194e+04 2.725789e+01 6.049293e+02
## ITGA2B ITGB3 ITGA3 ITGB4 CDH22 CDH26
## 3.647727e+02 1.068397e+00 1.682625e+03 1.274514e+03 9.652477e-01 5.922976e+01
## ITGB2 CELSR1 ITGB1BP2
## 2.753119e+01 3.178135e+03 1.410967e+01
ss<-sample_size_distribution(f=0.05,distributionObject=distribution,selectedGenes=genelist$Gene.name,storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 33 selectedGenes were not found in distributionObject, discard them for further analysis
ss
## $iter
## [1] 7
##
## $f.root
## [1] 0.0155463
##
## $root
## [1] 16
##
## $process
## N Power
## [1,] 1 0.005460439
## [2,] 9 0.587380911
## [3,] 13 0.756353866
## [4,] 15 0.799667357
## [5,] 16 0.815546304
## [6,] 17 0.829616720
## [7,] 33 0.925866132
##
## $parameters
## power m m1 fdr w rho k
## 8e-01 1e+04 1e+02 5e-02 1e+00 2e+00 1e+00
names <- as.character(genelist$Gene.name[genelist$Gene.name %in% names(gene_readcounts)])
to_drop <- names[which(gene_readcounts < 10)]
ss_drop10 <- sample_size_distribution(f=0.1,distributionObject=distribution,selectedGenes=names(which(gene_readcounts>=10)),storeProcess = TRUE)
# 76.7% power with N=9, 83.1% power with N=10
ss_drop10
## $iter
## [1] 7
##
## $f.root
## [1] 0.003185019
##
## $root
## [1] 9
##
## $process
## N Power
## [1,] 1 0.0166177
## [2,] 5 0.4009260
## [3,] 7 0.6373315
## [4,] 8 0.7337988
## [5,] 9 0.8031850
## [6,] 17 0.9867463
## [7,] 33 0.9960203
##
## $parameters
## power m m1 fdr w rho k
## 8e-01 1e+04 1e+02 1e-01 1e+00 2e+00 1e+00
to_drop80 <- names[which(gene_readcounts < 80)]
ss_drop80 <- sample_size_distribution(f=0.1,distributionObject=distribution,selectedGenes=names(which(gene_readcounts>=80)),storeProcess = TRUE)
# basically when rho goes up and genes go down it works
# so what exactly is rho?
# basically effect size... so question: do we expect it to be bigger or smaller than in the 2 group situation?
ss_drop80
## $iter
## [1] 7
##
## $f.root
## [1] 0.04811613
##
## $root
## [1] 9
##
## $process
## N Power
## [1,] 1 0.01870999
## [2,] 5 0.44171403
## [3,] 7 0.68761390
## [4,] 8 0.78338012
## [5,] 9 0.84811613
## [6,] 17 0.99309292
## [7,] 33 0.99600706
##
## $parameters
## power m m1 fdr w rho k
## 8e-01 1e+04 1e+02 1e-01 1e+00 2e+00 1e+00
to_drop200 <- names[which(gene_readcounts < 200)]
ss_drop200 <- sample_size_distribution(f=0.1,distributionObject=distribution,selectedGenes=names(which(gene_readcounts>=200)),storeProcess = TRUE)
ss_drop80_fold2.5 <- sample_size_distribution(f=0.1,rho=2.5,distributionObject=distribution,selectedGenes=names(which(gene_readcounts>=80)),storeProcess = TRUE)
ss_drop80_fold2.5
## $iter
## [1] 7
##
## $f.root
## [1] 0.04257095
##
## $root
## [1] 5
##
## $process
## N Power
## [1,] 1 0.05866677
## [2,] 3 0.49404035
## [3,] 4 0.70411021
## [4,] 5 0.84257095
## [5,] 9 0.99070871
## [6,] 17 0.99600868
## [7,] 33 0.99600717
##
## $parameters
## power m m1 fdr w rho k
## 0.8 10000.0 100.0 0.1 1.0 2.5 1.0
kept <- append(rownames(genelist[grepl("CDH[0-9]+$", genelist$Gene.name), ]), rownames(genelist[grepl("ITGA[0-9]+$", genelist$Gene.name), ]))
genelist[kept,]$Gene.name[which(!genelist[kept,]$Gene.stable.ID %in% names(distribution$pseudo.counts.mean))]
## [1] "CDH1" "CDH2" "CDH5" "CDH3" "CDH13" "CDH11" "CDH17" "CDH15"
## [9] "CDH4" "CDH12" "CDH6" "CDH10" "CDH16" "CDH23" "CDH18" "CDH8"
## [17] "CDH9" "CDH7" "CDH22" "CDH20" "CDH19" "CDH26" "CDH24" "ITGA7"
## [25] "ITGA6" "ITGA4" "ITGA5" "ITGA2" "ITGA3" "ITGA1" "ITGA9" "ITGA11"
## [33] "ITGA8" "ITGA10"
kept_final <- kept[which(genelist[kept,]$Gene.name %in% names(distribution$pseudo.counts.mean))]
power_kept <- est_power_distribution(n=6,f=0.1,m=52000,m1=4000,distributionObject=distribution,selectedGenes=genelist[kept_final,]$Gene.name,storeProcess = TRUE)
mean(power_kept$power) #32% power with just kept genes and 6 samples each group
## [1] 0.4895173
ss_kept <- sample_size_distribution(f=0.1,rho=2,distributionObject=distribution,selectedGenes=genelist[kept,]$Gene.name,storeProcess=TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 17 selectedGenes were not found in distributionObject, discard them for further analysis
ss_kept
## $iter
## [1] 7
##
## $f.root
## [1] 0.01177878
##
## $root
## [1] 14
##
## $process
## N Power
## [1,] 1 0.01292141
## [2,] 9 0.64958749
## [3,] 13 0.79281187
## [4,] 14 0.81177878
## [5,] 15 0.82756311
## [6,] 17 0.85210797
## [7,] 33 0.93997869
##
## $parameters
## power m m1 fdr w rho k
## 8e-01 1e+04 1e+02 1e-01 1e+00 2e+00 1e+00
found <- genelist[which(genelist$Gene.name %in% names(distribution$pseudo.counts.mean)),]
nrow(found)
## [1] 45
found <- genelist[which(genelist$Gene.name %in% names(distribution$pseudo.counts.mean)),]
found
## Gene.name Gene.stable.ID
## 1 CDH1 GC16P068737
## 3 CDH5 GC16P066366
## 4 CDH3 GC16P068637
## 8 CDH15 GC16P089171
## 12 CDH10 GC05M024522
## 15 CDH18 GC05M019508
## 19 CDH22 GC20M046173
## 22 CDH26 GC20P059958
## 23 CELSR1 GC22M046360
## 24 CDHR3 GC07P105876
## 25 CDH24 GC14M023047
## 26 CELSR2 GC01P109250
## 27 CELSR3 GC03M048641
## 29 CDHR2 GC05P176542
## 33 CPED1 GC07P120988
## 37 LOC100130345 GC03M063749
## 38 ITGA7 GC12M055684
## 39 ITGB1 GC10M033645
## 40 ITGB3 GC17P062247
## 41 ITGA6 GC02P172427
## 42 ITGAV GC02P186589
## 43 ITGB2 GC21M044885
## 44 ITGB4 GC17P075721
## 46 ITGA5 GC12M055005
## 47 ITGA2 GC05P052989
## 48 ITGA3 GC17P050055
## 49 ITGAL GC16P030472
## 50 ITGB6 GC02M160099
## 51 ITGA2B GC17M051904
## 52 ITGB7 GC12M053191
## 53 ITGA1 GC05P052788
## 54 ITGAM GC16P042850
## 55 ITGB5 GC03M124761
## 56 ILK GC11P006604
## 57 ITGAX GC16P042860
## 58 ITGA9 GC03P037468
## 59 ITGAE GC17M006905
## 60 ITGB8 GC07P020329
## 61 ITGB1BP1 GC02M009391
## 65 ITGA10 GC01M145891
## 67 ITGB1BP2 GC0XP071333
## 68 IBSP GC04P087799
## 69 ITGB3BP GC01M063440
## 70 ITFG2 GC12P002812
## 71 ITFG1 GC16M047156
n <- 1:15
gene_powers <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=found$Gene.name,storeProcess=TRUE)$power)
num_genes <- sapply(gene_powers,function(x) sum(x>0.8))
gene_powers_keptfinal <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=genelist[kept_final,]$Gene.name,storeProcess=TRUE)$power)
# CDH1:
musk_id <- which(found$Gene.name=='CDH1')
musk_power <- sapply(gene_powers, function(x) x[musk_id])
total_power <- sapply(gene_powers, function(x) mean(x))
total_keptfinal <- sapply(gene_powers_keptfinal, function(x) mean(x))
# Number of genes vs number of samples at power=0.8
d <- data.frame(n,num_genes)
ggplot(data=d,aes(x=n,y=num_genes,label=num_genes)) + geom_line() + geom_point() + theme_bw() + ylab("Number of genes with power > 0.8") + xlab("Number of samples in each group") + ggtitle("Number of genes of interest with sufficient power to detect\n as number of samples increases") + labs(caption="FDR=0.1,log fold change=2") + geom_text(data=subset(d, n>7),vjust=0,nudge_y=1) + scale_x_continuous(breaks=seq(1,15,2)) + scale_y_continuous(breaks=seq(0,70,10))
# Gene powers random
gene_powers_random <- sapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000))
# Power to detect musk and total power vs number of samples
d2 <- data.frame(n,musk_power,total_power,gene_powers_random) %>% tidyr::gather("type","power",musk_power,total_power,gene_powers_random)
ggplot(data=d2,aes(x=n)) + geom_line(aes(y=power,color=type)) + geom_point(aes(y=power,color=type)) + theme_bw() + ylab("Power") + xlab("Number of samples in each group") + ggtitle("Power for CDH1 gene and total power\n as number of samples increases") + labs(caption="FDR=0.1,log fold change=2,number of random genes=100") + geom_hline(yintercept=0.8,linetype='dashed') + scale_color_discrete(name="Power",breaks=c("musk_power","total_power","gene_powers_random"),labels=c("CDH1","Total for interested genes","Random genes")) + scale_x_continuous(breaks=seq(1,15,2)) + scale_y_continuous(breaks=seq(0,1,0.2))
length(names(which(distribution$pseudo.counts.mean>=10)))
## [1] 14065
found <- genelist[which(genelist$Gene.name %in% names(distribution$pseudo.counts.mean)),]
found
## Gene.name Gene.stable.ID
## 1 CDH1 GC16P068737
## 3 CDH5 GC16P066366
## 4 CDH3 GC16P068637
## 8 CDH15 GC16P089171
## 12 CDH10 GC05M024522
## 15 CDH18 GC05M019508
## 19 CDH22 GC20M046173
## 22 CDH26 GC20P059958
## 23 CELSR1 GC22M046360
## 24 CDHR3 GC07P105876
## 25 CDH24 GC14M023047
## 26 CELSR2 GC01P109250
## 27 CELSR3 GC03M048641
## 29 CDHR2 GC05P176542
## 33 CPED1 GC07P120988
## 37 LOC100130345 GC03M063749
## 38 ITGA7 GC12M055684
## 39 ITGB1 GC10M033645
## 40 ITGB3 GC17P062247
## 41 ITGA6 GC02P172427
## 42 ITGAV GC02P186589
## 43 ITGB2 GC21M044885
## 44 ITGB4 GC17P075721
## 46 ITGA5 GC12M055005
## 47 ITGA2 GC05P052989
## 48 ITGA3 GC17P050055
## 49 ITGAL GC16P030472
## 50 ITGB6 GC02M160099
## 51 ITGA2B GC17M051904
## 52 ITGB7 GC12M053191
## 53 ITGA1 GC05P052788
## 54 ITGAM GC16P042850
## 55 ITGB5 GC03M124761
## 56 ILK GC11P006604
## 57 ITGAX GC16P042860
## 58 ITGA9 GC03P037468
## 59 ITGAE GC17M006905
## 60 ITGB8 GC07P020329
## 61 ITGB1BP1 GC02M009391
## 65 ITGA10 GC01M145891
## 67 ITGB1BP2 GC0XP071333
## 68 IBSP GC04P087799
## 69 ITGB3BP GC01M063440
## 70 ITFG2 GC12P002812
## 71 ITFG1 GC16M047156
n <- 1:20
gene_powers <- lapply(n, function(x) est_power_distribution(n=x, f=0.05, rho=2, distributionObject=distribution, m=14065,m1=78,selectedGenes=found$Gene.name,storeProcess=TRUE)$power)
num_genes <- sapply(gene_powers,function(x) sum(x>0.8))
gene_powers_keptfinal <- lapply(n, function(x) est_power_distribution(n=x, f=0.05, rho=2, distributionObject=distribution, m=14065,m1=78,selectedGenes=genelist[kept_final,]$Gene.name,storeProcess=TRUE)$power)
# CDH1:
musk_id <- which(found$Gene.name=='CDH1')
musk_power <- sapply(gene_powers, function(x) x[musk_id])
total_power <- sapply(gene_powers, function(x) mean(x))
total_keptfinal <- sapply(gene_powers_keptfinal, function(x) mean(x))
# Number of genes vs number of samples at power=0.8
d <- data.frame(n,num_genes)
ggplot(data=d,aes(x=n,y=num_genes,label=num_genes)) + geom_line() + geom_point() + theme_bw() + ylab("Number of genes with power > 0.8") + xlab("Number of samples in each group") + ggtitle("Number of genes of interest with sufficient power to detect\n as number of samples increases") + labs(caption="FDR=0.1,log fold change=2") + geom_text(data=subset(d, n>7),vjust=0,nudge_y=1) + scale_x_continuous(breaks=seq(1,15,2)) + scale_y_continuous(breaks=seq(0,70,10))
# Gene powers random
gene_powers_random <- sapply(n, function(x) est_power_distribution(n=x, f=0.05, rho=2, distributionObject=distribution, m=14065,m1=78))
# Power to detect musk and total power vs number of samples
d2 <- data.frame(n,musk_power,total_power,gene_powers_random) %>% tidyr::gather("type","power",musk_power,total_power,gene_powers_random)
ggplot(data=d2,aes(x=n)) + geom_line(aes(y=power,color=type)) + geom_point(aes(y=power,color=type)) + theme_bw() + ylab("Power") + xlab("Number of samples in each group") + ggtitle("Power for CDH1 gene and total power\n as number of samples increases") + labs(caption="FDR=0.1,log fold change=2,number of random genes=100") + geom_hline(yintercept=0.8,linetype='dashed') + scale_color_discrete(name="Power",breaks=c("musk_power","total_power","gene_powers_random"),labels=c("CDH1","Total for interested genes","Random genes")) + scale_x_continuous(breaks=seq(1,20,2)) + scale_y_continuous(breaks=seq(0,1,0.2))